import scrapy
from ..items import JobInformationItem
from ..utils.key_word_match import KeywordMatch
from ..utils.logger import logger


class JobinformationSpider(scrapy.Spider):
    """Spider that crawls civil-service exam announcements from chinagwy.org.

    Walks the paginated announcement list ("zkgg") from the oldest requested
    page back to the index page, and yields one JobInformationItem per list
    entry with its exam type, detail URL, publish date, title and matched
    keywords.
    """

    name = "JobInformation"
    # Class attribute so Scrapy's OffsiteMiddleware can read it; a local
    # variable inside start_requests() was silently ignored.
    allowed_domains = ["www.chinagwy.org"]

    def __init__(self, page=None, **kwargs):
        """:param page: optional page count (string or int); when given,
        list pages 2..page are crawled, otherwise 2..19 by default."""
        super().__init__(**kwargs)
        self.page = page

    def start_requests(self):
        """Build the list-page URLs (oldest first, index page last) and
        schedule a request for each one."""
        last_page = int(self.page) if self.page else 20
        urls = [
            f"https://www.chinagwy.org/html/zkgg/3_{i}.html"
            for i in range(2, last_page)
        ] if not self.page else [
            f"https://www.chinagwy.org/html/zkgg/3_{i}.html"
            for i in range(2, int(self.page) + 1)
        ]
        # Crawl from the highest-numbered (oldest) page down, then the
        # front index page last, so newest announcements are parsed last.
        urls.reverse()
        urls.append("https://www.chinagwy.org/html/zkgg/index.html")
        self.start_urls = urls
        logger.info("拼接后所有的url : %s", self.start_urls)
        logger.info("爬取个数 : %s", len(self.start_urls))
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract one item per <li> row of the announcement list.

        :param response: Scrapy response for a list page.
        :yields: JobInformationItem with exam_type, url, publish_at,
                 title and key_word fields (fields absent in a row are
                 simply not set on that item).
        """
        matcher = KeywordMatch()  # one matcher per page, not per row
        rows = response.xpath(
            "/html/body/div/div[2]/div[1]/div[2]/div[2]/div[2]/ul/li"
        )
        for row in rows:
            # A fresh item per row: reusing one item across iterations
            # leaked fields from previous rows into rows lacking them.
            item = JobInformationItem()
            links = row.xpath("a/@href").extract()
            if len(links) > 1:
                # First <a> is the category label, second holds the detail URL.
                item['exam_type'] = row.xpath("a/text()").extract()[0].strip()
                item['url'] = links[1].strip()
            dates = row.xpath("span/text()").extract()
            if dates:
                item['publish_at'] = dates[0].strip()
            titles = row.xpath("a/@title").extract()
            if titles:
                item['title'] = titles[0].strip()
            # .get() guards against rows with no @title (previously a
            # KeyError risk once items were no longer shared across rows).
            item['key_word'] = matcher.key_word_match(item.get('title', ''))
            logger.info("解析出来的数据是: %s", item)
            yield item
